From bc7d2f0c08907a9b1554bdb4ffe90d07e009963c Mon Sep 17 00:00:00 2001
From: Guolin Ke
Date: Tue, 3 Mar 2020 15:56:02 +0800
Subject: [PATCH] speed up for const hessian (#2857)

* speed up for const hessian

* rename template

* fix clang build

* template init

* add comment
---
 include/LightGBM/bin.h          |  27 ++--
 include/LightGBM/dataset.h      |  54 ++++++--
 src/io/dataset.cpp              | 213 +++++++++++++-------------------
 src/io/dense_bin.hpp            |  40 +++---
 src/io/dense_nbits_bin.hpp      |  42 ++++---
 src/io/multi_val_dense_bin.hpp  |  78 ++++++------
 src/io/multi_val_sparse_bin.hpp |  69 +++++------
 src/io/sparse_bin.hpp           |  14 ++-
 8 files changed, 272 insertions(+), 265 deletions(-)

diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h
index 0528b592fea..db84b274b91 100644
--- a/include/LightGBM/bin.h
+++ b/include/LightGBM/bin.h
@@ -30,6 +30,9 @@ enum MissingType {
 };
 
 typedef double hist_t;
+typedef uint64_t hist_cnt_t;
+// check at compile time
+static_assert(sizeof(hist_t) == sizeof(hist_cnt_t), "Histogram entry size is not correct");
 
 const size_t kHistEntrySize = 2 * sizeof(hist_t);
 const int kHistOffset = 2;
@@ -482,20 +485,24 @@ class MultiValBin {
                           const std::vector<uint32_t>& lower,
                           const std::vector<uint32_t>& upper,
                           const std::vector<uint32_t>& delta) = 0;
 
-  virtual void ConstructHistogram(
-      const data_size_t* data_indices, data_size_t start, data_size_t end,
-      const score_t* gradients, const score_t* hessians,
-      hist_t* out) const = 0;
+  virtual void ConstructHistogram(const data_size_t* data_indices,
+                                  data_size_t start, data_size_t end,
+                                  const score_t* gradients,
+                                  const score_t* hessians,
+                                  hist_t* out) const = 0;
 
   virtual void ConstructHistogram(data_size_t start, data_size_t end,
-                                  const score_t* gradients, const score_t* hessians,
-                                  hist_t* out) const = 0;
+                                  const score_t* gradients,
+                                  const score_t* hessians,
+                                  hist_t* out) const = 0;
 
-  virtual void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
-                                  const score_t* ordered_gradients, hist_t* out) const = 0;
-  virtual void ConstructHistogram(data_size_t start, data_size_t end,
-                                  const score_t* ordered_gradients, hist_t* out) const = 0;
+  virtual void ConstructHistogramOrdered(const data_size_t* data_indices,
+                                         data_size_t start, data_size_t end,
+                                         const score_t* ordered_gradients,
+                                         const score_t* ordered_hessians,
+                                         hist_t* out) const = 0;
+
   virtual void FinishLoad() = 0;
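The new hist_cnt_t alias and the static_assert exist so that, when the hessian is constant, an integer count can be accumulated directly in the hessian slot of each histogram entry and converted once at the end. A minimal standalone sketch of that layout trick follows (illustrative names, not LightGBM source):

// Sketch: each histogram entry is a (gradient, hessian) pair of hist_t; with a
// constant hessian the hessian slot is reused as a 64-bit counter.
#include <cstddef>
#include <cstdint>
#include <vector>

typedef double hist_t;
typedef uint64_t hist_cnt_t;
static_assert(sizeof(hist_t) == sizeof(hist_cnt_t),
              "the counter must overlay the hessian slot exactly");

void ConstHessianHistogram(const std::vector<uint32_t>& bin_of_row,
                           const std::vector<float>& gradients,
                           double const_hessian, int num_bin,
                           hist_t* out /* 2 * num_bin entries, zeroed */) {
  hist_t* grad = out;
  hist_cnt_t* cnt = reinterpret_cast<hist_cnt_t*>(out + 1);
  for (size_t i = 0; i < bin_of_row.size(); ++i) {
    const auto ti = static_cast<uint32_t>(bin_of_row[i]) << 1;
    grad[ti] += gradients[i];
    ++cnt[ti];  // integer increment instead of a floating-point hessian add
  }
  // Single pass at the end: count * constant hessian -> hessian sum.
  for (int b = 0; b < num_bin; ++b) {
    const hist_cnt_t c = cnt[b << 1];
    out[(b << 1) + 1] = static_cast<hist_t>(c) * const_hessian;
  }
}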
diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index 0b05725ef7e..b6fe917d194 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -482,20 +482,56 @@ class Dataset {
   void InitTrain(const std::vector<int8_t>& is_feature_used,
                  TrainingShareStates* share_state) const;
 
-  void ConstructHistograms(const std::vector<int8_t>& is_feature_used,
-                           const data_size_t* data_indices,
-                           data_size_t num_data, const score_t* gradients,
-                           const score_t* hessians, score_t* ordered_gradients,
-                           score_t* ordered_hessians,
-                           TrainingShareStates* share_state,
-                           hist_t* histogram_data) const;
-
+  template <bool USE_INDICES, bool USE_HESSIAN>
+  void ConstructHistogramsInner(const std::vector<int8_t>& is_feature_used,
+                                const data_size_t* data_indices,
+                                data_size_t num_data, const score_t* gradients,
+                                const score_t* hessians,
+                                score_t* ordered_gradients,
+                                score_t* ordered_hessians,
+                                TrainingShareStates* share_state,
+                                hist_t* hist_data) const;
+
+  template <bool USE_INDICES, bool ORDERED>
   void ConstructHistogramsMultiVal(const data_size_t* data_indices,
                                    data_size_t num_data,
                                    const score_t* gradients,
                                    const score_t* hessians,
                                    TrainingShareStates* share_state,
-                                   hist_t* histogram_data) const;
+                                   hist_t* hist_data) const;
+
+  inline void ConstructHistograms(
+      const std::vector<int8_t>& is_feature_used,
+      const data_size_t* data_indices, data_size_t num_data,
+      const score_t* gradients, const score_t* hessians,
+      score_t* ordered_gradients, score_t* ordered_hessians,
+      TrainingShareStates* share_state, hist_t* hist_data) const {
+    if (num_data <= 0) {
+      return;
+    }
+    bool use_indices = data_indices != nullptr && (num_data < num_data_);
+    if (share_state->is_constant_hessian) {
+      if (use_indices) {
+        ConstructHistogramsInner<true, false>(
+            is_feature_used, data_indices, num_data, gradients, hessians,
+            ordered_gradients, ordered_hessians, share_state, hist_data);
+      } else {
+        ConstructHistogramsInner<false, false>(
+            is_feature_used, data_indices, num_data, gradients, hessians,
+            ordered_gradients, ordered_hessians, share_state, hist_data);
+      }
+    } else {
+      if (use_indices) {
+        ConstructHistogramsInner<true, true>(
+            is_feature_used, data_indices, num_data, gradients, hessians,
+            ordered_gradients, ordered_hessians, share_state, hist_data);
+      } else {
+        ConstructHistogramsInner<false, true>(
+            is_feature_used, data_indices, num_data, gradients, hessians,
+            ordered_gradients, ordered_hessians, share_state, hist_data);
+      }
+    }
+  }
 
   void FixHistogram(int feature_idx, double sum_gradient, double sum_hessian,
                     hist_t* data) const;
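The inline ConstructHistograms wrapper added above turns two runtime flags (subset indices present, constant hessian) into template parameters, so each specialization of ConstructHistogramsInner is compiled without per-row branches. A small self-contained sketch of the same dispatch pattern, with hypothetical names:

// Sketch of runtime-to-compile-time dispatch; names are illustrative only.
#include <cstddef>
#include <cstdint>

template <bool USE_INDICES, bool USE_HESSIAN>
void AccumulateInner(const int32_t* indices, size_t n, const float* grad,
                     const float* hess, const uint32_t* bin_of_row,
                     double* out /* 2 * num_bin entries, zeroed */) {
  for (size_t i = 0; i < n; ++i) {
    // These ternaries are constant-folded in each specialization.
    const size_t row = USE_INDICES ? static_cast<size_t>(indices[i]) : i;
    const size_t ti = static_cast<size_t>(bin_of_row[row]) << 1;
    out[ti] += grad[row];
    if (USE_HESSIAN) {
      out[ti + 1] += hess[row];
    }  // constant-hessian case: hessian handled once by the caller
  }
}

// The runtime decision is made exactly once, outside the hot loop.
void Accumulate(const int32_t* indices, size_t n, bool constant_hessian,
                const float* grad, const float* hess,
                const uint32_t* bin_of_row, double* out) {
  const bool use_indices = indices != nullptr;
  if (constant_hessian) {
    if (use_indices) {
      AccumulateInner<true, false>(indices, n, grad, hess, bin_of_row, out);
    } else {
      AccumulateInner<false, false>(indices, n, grad, hess, bin_of_row, out);
    }
  } else {
    if (use_indices) {
      AccumulateInner<true, true>(indices, n, grad, hess, bin_of_row, out);
    } else {
      AccumulateInner<false, true>(indices, n, grad, hess, bin_of_row, out);
    }
  }
}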
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index 89ff05776b9..5af66c8960c 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -651,8 +651,9 @@ TrainingShareStates* Dataset::GetShareStates(
                       hist_data.data());
   col_wise_time = std::chrono::steady_clock::now() - start_time;
   start_time = std::chrono::steady_clock::now();
-  ConstructHistogramsMultiVal(nullptr, num_data_, gradients, hessians,
-                              rowwise_state.get(), hist_data.data());
+  ConstructHistograms(is_feature_used, nullptr, num_data_, gradients,
+                      hessians, gradients, hessians, rowwise_state.get(),
+                      hist_data.data());
   row_wise_time = std::chrono::steady_clock::now() - start_time;
   Log::Debug("col-wise cost %f seconds, row-wise cost %f seconds",
              col_wise_time * 1e-3, row_wise_time * 1e-3);
@@ -1193,6 +1194,7 @@ void Dataset::InitTrain(const std::vector<int8_t>& is_feature_used,
   }
 }
 
+template <bool USE_INDICES, bool ORDERED>
 void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
                                           data_size_t num_data,
                                           const score_t* gradients,
@@ -1237,21 +1239,17 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
                  static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
     }
     std::memset(reinterpret_cast<void*>(data_ptr), 0, num_bin * kHistEntrySize);
-    if (data_indices != nullptr && num_data < num_data_) {
-      if (!share_state->is_constant_hessian) {
-        multi_val_bin->ConstructHistogram(data_indices, start, end, gradients,
-                                          hessians, data_ptr);
+    if (USE_INDICES) {
+      if (ORDERED) {
+        multi_val_bin->ConstructHistogramOrdered(data_indices, start, end,
+                                                 gradients, hessians, data_ptr);
       } else {
         multi_val_bin->ConstructHistogram(data_indices, start, end, gradients,
-                                          data_ptr);
+                                          hessians, data_ptr);
       }
     } else {
-      if (!share_state->is_constant_hessian) {
-        multi_val_bin->ConstructHistogram(start, end, gradients, hessians,
-                                          data_ptr);
-      } else {
-        multi_val_bin->ConstructHistogram(start, end, gradients, data_ptr);
-      }
+      multi_val_bin->ConstructHistogram(start, end, gradients, hessians,
+                                        data_ptr);
     }
     OMP_LOOP_EX_END();
   }
@@ -1263,33 +1261,15 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
   int bin_block_size = num_bin;
   Threading::BlockInfo(share_state->num_threads, num_bin, 512, &n_bin_block,
                        &bin_block_size);
-  if (!share_state->is_constant_hessian) {
 #pragma omp parallel for schedule(static)
-    for (int t = 0; t < n_bin_block; ++t) {
-      const int start = t * bin_block_size;
-      const int end = std::min(start + bin_block_size, num_bin);
-      for (int tid = 1; tid < n_data_block; ++tid) {
-        auto src_ptr = share_state->hist_buf.data() +
-                       static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
-        for (int i = start * 2; i < end * 2; ++i) {
-          hist_data[i] += src_ptr[i];
-        }
-      }
-    }
-  } else {
-#pragma omp parallel for schedule(static)
-    for (int t = 0; t < n_bin_block; ++t) {
-      const int start = t * bin_block_size;
-      const int end = std::min(start + bin_block_size, num_bin);
-      for (int tid = 1; tid < n_data_block; ++tid) {
-        auto src_ptr = share_state->hist_buf.data() +
-                       static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
-        for (int i = start * 2; i < end * 2; ++i) {
-          hist_data[i] += src_ptr[i];
-        }
-      }
-      for (int i = start; i < end; ++i) {
-        GET_HESS(hist_data, i) = GET_HESS(hist_data, i) * hessians[0];
+  for (int t = 0; t < n_bin_block; ++t) {
+    const int start = t * bin_block_size;
+    const int end = std::min(start + bin_block_size, num_bin);
+    for (int tid = 1; tid < n_data_block; ++tid) {
+      auto src_ptr = share_state->hist_buf.data() +
+                     static_cast<size_t>(num_bin_aligned) * 2 * (tid - 1);
+      for (int i = start * 2; i < end * 2; ++i) {
+        hist_data[i] += src_ptr[i];
      }
    }
  }
@@ -1299,20 +1279,16 @@ void Dataset::ConstructHistogramsMultiVal(const data_size_t* data_indices,
   global_timer.Stop("Dataset::sparse_bin_histogram_move");
 }
 
-void Dataset::ConstructHistograms(
+template <bool USE_INDICES, bool USE_HESSIAN>
+void Dataset::ConstructHistogramsInner(
     const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
     data_size_t num_data, const score_t* gradients, const score_t* hessians,
     score_t* ordered_gradients, score_t* ordered_hessians,
     TrainingShareStates* share_state, hist_t* hist_data) const {
-  Common::FunctionTimer fun_timer("Dataset::ConstructHistograms", global_timer);
-  if (num_data < 0 || hist_data == nullptr) {
-    return;
-  }
   if (!share_state->is_colwise) {
-    return ConstructHistogramsMultiVal(data_indices, num_data, gradients,
-                                       hessians, share_state, hist_data);
+    return ConstructHistogramsMultiVal<USE_INDICES, false>(
+        data_indices, num_data, gradients, hessians, share_state, hist_data);
   }
-  global_timer.Start("Dataset::Get used group");
   std::vector<int> used_dense_group;
   int multi_val_groud_id = -1;
   used_dense_group.reserve(num_groups_);
@@ -1335,117 +1311,102 @@ void Dataset::ConstructHistograms(
     }
   }
   int num_used_dense_group = static_cast<int>(used_dense_group.size());
-  global_timer.Stop("Dataset::Get used group");
   global_timer.Start("Dataset::dense_bin_histogram");
+  auto ptr_ordered_grad = gradients;
+  auto ptr_ordered_hess = hessians;
   if (num_used_dense_group > 0) {
-    auto ptr_ordered_grad = gradients;
-    auto ptr_ordered_hess = hessians;
-    if (data_indices != nullptr && num_data < num_data_) {
-      if (!share_state->is_constant_hessian) {
+    if (USE_INDICES) {
+      if (USE_HESSIAN) {
 #pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
         for (data_size_t i = 0; i < num_data; ++i) {
           ordered_gradients[i] = gradients[data_indices[i]];
           ordered_hessians[i] = hessians[data_indices[i]];
         }
+        ptr_ordered_grad = ordered_gradients;
+        ptr_ordered_hess = ordered_hessians;
       } else {
#pragma omp parallel for schedule(static, 512) if (num_data >= 1024)
         for (data_size_t i = 0; i < num_data; ++i) {
           ordered_gradients[i] = gradients[data_indices[i]];
         }
+        ptr_ordered_grad = ordered_gradients;
       }
-      ptr_ordered_grad = ordered_gradients;
-      ptr_ordered_hess = ordered_hessians;
-      if (!share_state->is_constant_hessian) {
-        OMP_INIT_EX();
+    }
+    OMP_INIT_EX();
 #pragma omp parallel for schedule(static)
-        for (int gi = 0; gi < num_used_dense_group; ++gi) {
-          OMP_LOOP_EX_BEGIN();
-          int group = used_dense_group[gi];
-          // feature is not used
-          auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
-          const int num_bin = feature_groups_[group]->num_total_bin_;
-          std::memset(reinterpret_cast<void*>(data_ptr), 0,
-                      num_bin * kHistEntrySize);
-          // construct histograms for smaller leaf
+    for (int gi = 0; gi < num_used_dense_group; ++gi) {
+      OMP_LOOP_EX_BEGIN();
+      int group = used_dense_group[gi];
+      auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
+      const int num_bin = feature_groups_[group]->num_total_bin_;
+      std::memset(reinterpret_cast<void*>(data_ptr), 0,
+                  num_bin * kHistEntrySize);
+      if (USE_HESSIAN) {
+        if (USE_INDICES) {
           feature_groups_[group]->bin_data_->ConstructHistogram(
               data_indices, 0, num_data, ptr_ordered_grad, ptr_ordered_hess,
               data_ptr);
-          OMP_LOOP_EX_END();
-        }
-        OMP_THROW_EX();
-
-      } else {
-        OMP_INIT_EX();
-#pragma omp parallel for schedule(static)
-        for (int gi = 0; gi < num_used_dense_group; ++gi) {
-          OMP_LOOP_EX_BEGIN();
-          int group = used_dense_group[gi];
-          // feature is not used
-          auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
-          const int num_bin = feature_groups_[group]->num_total_bin_;
-          std::memset(reinterpret_cast<void*>(data_ptr), 0,
-                      num_bin * kHistEntrySize);
-          // construct histograms for smaller leaf
-          feature_groups_[group]->bin_data_->ConstructHistogram(
-              data_indices, 0, num_data, ptr_ordered_grad, data_ptr);
-          // fixed hessian.
-          for (int i = 0; i < num_bin; ++i) {
-            GET_HESS(data_ptr, i) = GET_HESS(data_ptr, i) * hessians[0];
-          }
-          OMP_LOOP_EX_END();
-        }
-        OMP_THROW_EX();
-      }
-    } else {
-      if (!share_state->is_constant_hessian) {
-        OMP_INIT_EX();
-#pragma omp parallel for schedule(static)
-        for (int gi = 0; gi < num_used_dense_group; ++gi) {
-          OMP_LOOP_EX_BEGIN();
-          int group = used_dense_group[gi];
-          // feature is not used
-          auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
-          const int num_bin = feature_groups_[group]->num_total_bin_;
-          std::memset(reinterpret_cast<void*>(data_ptr), 0,
-                      num_bin * kHistEntrySize);
-          // construct histograms for smaller leaf
+        } else {
           feature_groups_[group]->bin_data_->ConstructHistogram(
               0, num_data, ptr_ordered_grad, ptr_ordered_hess, data_ptr);
-          OMP_LOOP_EX_END();
         }
-        OMP_THROW_EX();
       } else {
-        OMP_INIT_EX();
-#pragma omp parallel for schedule(static)
-        for (int gi = 0; gi < num_used_dense_group; ++gi) {
-          OMP_LOOP_EX_BEGIN();
-          int group = used_dense_group[gi];
-          // feature is not used
-          auto data_ptr = hist_data + group_bin_boundaries_[group] * 2;
-          const int num_bin = feature_groups_[group]->num_total_bin_;
-          std::memset(reinterpret_cast<void*>(data_ptr), 0,
-                      num_bin * kHistEntrySize);
-          // construct histograms for smaller leaf
+        if (USE_INDICES) {
+          feature_groups_[group]->bin_data_->ConstructHistogram(
+              data_indices, 0, num_data, ptr_ordered_grad, data_ptr);
+        } else {
           feature_groups_[group]->bin_data_->ConstructHistogram(
              0, num_data, ptr_ordered_grad, data_ptr);
-          // fixed hessian.
-          for (int i = 0; i < num_bin; ++i) {
-            GET_HESS(data_ptr, i) = GET_HESS(data_ptr, i) * hessians[0];
-          }
-          OMP_LOOP_EX_END();
         }
-        OMP_THROW_EX();
+        auto cnt_dst = reinterpret_cast<hist_cnt_t*>(data_ptr + 1);
+        for (int i = 0; i < num_bin * 2; i += 2) {
+          data_ptr[i + 1] = static_cast<hist_t>(cnt_dst[i]) * hessians[0];
+        }
       }
+      OMP_LOOP_EX_END();
     }
+    OMP_THROW_EX();
   }
   global_timer.Stop("Dataset::dense_bin_histogram");
   if (multi_val_groud_id >= 0) {
-    ConstructHistogramsMultiVal(
-        data_indices, num_data, gradients, hessians, share_state,
-        hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
+    if (num_used_dense_group > 0) {
+      ConstructHistogramsMultiVal<USE_INDICES, true>(
+          data_indices, num_data, ptr_ordered_grad, ptr_ordered_hess,
+          share_state,
+          hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
+    } else {
+      ConstructHistogramsMultiVal<USE_INDICES, false>(
+          data_indices, num_data, gradients, hessians, share_state,
+          hist_data + group_bin_boundaries_[multi_val_groud_id] * 2);
+    }
   }
 }
 
+// explicitly initialize template methods, for cross module call
+template void Dataset::ConstructHistogramsInner<true, true>(
+    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
+    data_size_t num_data, const score_t* gradients, const score_t* hessians,
+    score_t* ordered_gradients, score_t* ordered_hessians,
+    TrainingShareStates* share_state, hist_t* hist_data) const;
+
+template void Dataset::ConstructHistogramsInner<false, true>(
+    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
+    data_size_t num_data, const score_t* gradients, const score_t* hessians,
+    score_t* ordered_gradients, score_t* ordered_hessians,
+    TrainingShareStates* share_state, hist_t* hist_data) const;
+
+template void Dataset::ConstructHistogramsInner<true, false>(
+    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
+    data_size_t num_data, const score_t* gradients, const score_t* hessians,
+    score_t* ordered_gradients, score_t* ordered_hessians,
+    TrainingShareStates* share_state, hist_t* hist_data) const;
+
+template void Dataset::ConstructHistogramsInner<false, false>(
+    const std::vector<int8_t>& is_feature_used, const data_size_t* data_indices,
+    data_size_t num_data, const score_t* gradients, const score_t* hessians,
+    score_t* ordered_gradients, score_t* ordered_hessians,
+    TrainingShareStates* share_state, hist_t* hist_data) const;
+
 void Dataset::FixHistogram(int feature_idx, double sum_gradient,
                            double sum_hessian, hist_t* data) const {
   const int group = feature2group_[feature_idx];
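The block of `template void Dataset::ConstructHistogramsInner<...>` lines above is explicit instantiation: the template body lives in dataset.cpp, while the inline wrapper in dataset.h calls it from other translation units. A minimal sketch of why that is needed, using a hypothetical Histogrammer class rather than LightGBM's types:

// --- histogrammer.h (conceptually) -------------------------------------
class Histogrammer {
 public:
  template <bool USE_INDICES, bool USE_HESSIAN>
  void BuildInner(const float* grad, const float* hess, int n) const;

  // Inline wrapper visible to every caller; it only *uses* the template.
  inline void Build(const float* grad, const float* hess, int n,
                    bool use_indices, bool use_hessian) const {
    if (use_indices) {
      if (use_hessian) {
        BuildInner<true, true>(grad, hess, n);
      } else {
        BuildInner<true, false>(grad, hess, n);
      }
    } else {
      if (use_hessian) {
        BuildInner<false, true>(grad, hess, n);
      } else {
        BuildInner<false, false>(grad, hess, n);
      }
    }
  }
};

// --- histogrammer.cpp (conceptually) ------------------------------------
template <bool USE_INDICES, bool USE_HESSIAN>
void Histogrammer::BuildInner(const float*, const float*, int) const {
  // ... body only visible in this translation unit ...
}

// Without these, callers of Build() in other .cpp files would hit
// undefined-reference linker errors for the specializations they use.
template void Histogrammer::BuildInner<true, true>(const float*, const float*, int) const;
template void Histogrammer::BuildInner<true, false>(const float*, const float*, int) const;
template void Histogrammer::BuildInner<false, true>(const float*, const float*, int) const;
template void Histogrammer::BuildInner<false, false>(const float*, const float*, int) const;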
diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp
index 128501b0159..f580c2d3d3e 100644
--- a/src/io/dense_bin.hpp
+++ b/src/io/dense_bin.hpp
@@ -68,42 +68,42 @@ class DenseBin: public Bin {
   BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin,
                            uint32_t most_freq_bin) const override;
 
-  #define ACC_GH(hist, i, g, h) \
-  const auto ti = static_cast<int>(i) << 1; \
-  hist[ti] += g; \
-  hist[ti + 1] += h; \
-
-  template <bool use_indices, bool use_prefetch, bool use_hessians>
+  template <bool USE_INDICES, bool USE_PREFETCH, bool USE_HESSIAN>
   void ConstructHistogramInner(const data_size_t* data_indices,
                                data_size_t start, data_size_t end,
                                const score_t* ordered_gradients,
                                const score_t* ordered_hessians,
                                hist_t* out) const {
     data_size_t i = start;
-
-    if (use_prefetch) {
+    hist_t* grad = out;
+    hist_t* hess = out + 1;
+    hist_cnt_t* cnt = reinterpret_cast<hist_cnt_t*>(hess);
+    if (USE_PREFETCH) {
       const data_size_t pf_offset = 64 / sizeof(VAL_T);
       const data_size_t pf_end = end - pf_offset;
       for (; i < pf_end; ++i) {
-        const auto idx = use_indices ? data_indices[i] : i;
-        const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
+        const auto idx = USE_INDICES ? data_indices[i] : i;
+        const auto pf_idx = USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
         PREFETCH_T0(data_.data() + pf_idx);
-        const VAL_T bin = data_[idx];
-        if (use_hessians) {
-          ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
+        const auto ti = static_cast<uint32_t>(data_[idx]) << 1;
+        if (USE_HESSIAN) {
+          grad[ti] += ordered_gradients[i];
+          hess[ti] += ordered_hessians[i];
         } else {
-          ACC_GH(out, bin, ordered_gradients[i], 1.0f);
+          grad[ti] += ordered_gradients[i];
+          ++cnt[ti];
         }
       }
     }
     for (; i < end; ++i) {
-      const auto idx = use_indices ? data_indices[i] : i;
-      const VAL_T bin = data_[idx];
-      if (use_hessians) {
-        ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
+      const auto idx = USE_INDICES ? data_indices[i] : i;
+      const auto ti = static_cast<uint32_t>(data_[idx]) << 1;
+      if (USE_HESSIAN) {
+        grad[ti] += ordered_gradients[i];
+        hess[ti] += ordered_hessians[i];
      } else {
-        ACC_GH(out, bin, ordered_gradients[i], 1.0f);
+        grad[ti] += ordered_gradients[i];
+        ++cnt[ti];
      }
    }
  }
-  #undef ACC_GH
 
   void ConstructHistogram(const data_size_t* data_indices, data_size_t start,
                           data_size_t end, const score_t* ordered_gradients,
                           const score_t* ordered_hessians,
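ConstructHistogramInner above keeps the existing prefetch scheme: the loop looks pf_offset rows ahead and issues PREFETCH_T0 so the bin bytes are already in cache when they are read. A hedged sketch of the same pattern using the GCC/Clang prefetch builtin (helper names are illustrative, not LightGBM's macros):

#include <cstdint>
#include <vector>

inline void PrefetchRead(const void* p) {
#if defined(__GNUC__) || defined(__clang__)
  __builtin_prefetch(p, /*rw=*/0, /*locality=*/3);
#else
  (void)p;  // no-op fallback on other compilers
#endif
}

void HistogramWithPrefetch(const std::vector<uint8_t>& bin,   // one bin per row
                           const std::vector<int32_t>& rows,  // gathered row ids
                           const float* ordered_grad, double* out) {
  const int64_t n = static_cast<int64_t>(rows.size());
  // Look far enough ahead to cover one cache line of 1-byte bin values.
  const int64_t pf_offset = 64 / static_cast<int64_t>(sizeof(uint8_t));
  const int64_t pf_end = n - pf_offset;
  int64_t i = 0;
  for (; i < pf_end; ++i) {
    PrefetchRead(bin.data() + rows[i + pf_offset]);  // warm a future row's bin
    const auto ti = static_cast<uint32_t>(bin[rows[i]]) << 1;
    out[ti] += ordered_grad[i];
    out[ti + 1] += 1.0;  // plain count here; a constant hessian is applied later
  }
  for (; i < n; ++i) {  // tail rows without prefetch
    const auto ti = static_cast<uint32_t>(bin[rows[i]]) << 1;
    out[ti] += ordered_grad[i];
    out[ti + 1] += 1.0;
  }
}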
diff --git a/src/io/dense_nbits_bin.hpp b/src/io/dense_nbits_bin.hpp
index f2829c8080b..4d97978486f 100644
--- a/src/io/dense_nbits_bin.hpp
+++ b/src/io/dense_nbits_bin.hpp
@@ -73,42 +73,44 @@ class Dense4bitsBin : public Bin {
   inline BinIterator* GetIterator(uint32_t min_bin, uint32_t max_bin,
                                   uint32_t most_freq_bin) const override;
 
-  #define ACC_GH(hist, i, g, h) \
-  const auto ti = (i) << 1; \
-  hist[ti] += g; \
-  hist[ti + 1] += h; \
-
-  template <bool use_indices, bool use_prefetch, bool use_hessians>
+  template <bool USE_INDICES, bool USE_PREFETCH, bool USE_HESSIAN>
   void ConstructHistogramInner(const data_size_t* data_indices,
                                data_size_t start, data_size_t end,
                                const score_t* ordered_gradients,
                                const score_t* ordered_hessians,
                                hist_t* out) const {
     data_size_t i = start;
-
-    if (use_prefetch) {
+    hist_t* grad = out;
+    hist_t* hess = out + 1;
+    hist_cnt_t* cnt = reinterpret_cast<hist_cnt_t*>(hess);
+    if (USE_PREFETCH) {
       const data_size_t pf_offset = 64;
       const data_size_t pf_end = end - pf_offset;
       for (; i < pf_end; ++i) {
-        const auto idx = use_indices ? data_indices[i] : i;
-        const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
+        const auto idx = USE_INDICES ? data_indices[i] : i;
+        const auto pf_idx = USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
         PREFETCH_T0(data_.data() + (pf_idx >> 1));
-        const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
-        if (use_hessians) {
-          ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
+        const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
+        const uint8_t ti = static_cast<uint8_t>(bin) << 1;
+        if (USE_HESSIAN) {
+          grad[ti] += ordered_gradients[i];
+          hess[ti] += ordered_hessians[i];
        } else {
-          ACC_GH(out, bin, ordered_gradients[i], 1.0f);
+          grad[ti] += ordered_gradients[i];
+          ++cnt[ti];
        }
      }
    }
     for (; i < end; ++i) {
-      const auto idx = use_indices ? data_indices[i] : i;
-      const auto bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
-      if (use_hessians) {
-        ACC_GH(out, bin, ordered_gradients[i], ordered_hessians[i]);
+      const auto idx = USE_INDICES ? data_indices[i] : i;
+      const uint8_t bin = (data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
+      const uint8_t ti = static_cast<uint8_t>(bin) << 1;
+      if (USE_HESSIAN) {
+        grad[ti] += ordered_gradients[i];
+        hess[ti] += ordered_hessians[i];
      } else {
-        ACC_GH(out, bin, ordered_gradients[i], 1.0f);
+        grad[ti] += ordered_gradients[i];
+        ++cnt[ti];
      }
    }
  }
-  #undef ACC_GH
 
   void ConstructHistogram(const data_size_t* data_indices, data_size_t start,
                           data_size_t end, const score_t* ordered_gradients,
                           const score_t* ordered_hessians,
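Dense4bitsBin stores two bins per byte, and the loop above unpacks them with `(data_[idx >> 1] >> ((idx & 1) << 2)) & 0xf`. A small standalone sketch of that packing scheme (hypothetical helpers, not LightGBM API):

#include <cassert>
#include <cstdint>
#include <vector>

// Low nibble holds even row indices, high nibble holds odd row indices.
void Set4bit(std::vector<uint8_t>* data, size_t idx, uint8_t bin) {
  assert(bin < 16);
  uint8_t& byte = (*data)[idx >> 1];
  const int shift = static_cast<int>((idx & 1) << 2);  // 0 or 4
  byte = static_cast<uint8_t>((byte & ~(0xf << shift)) | (bin << shift));
}

uint8_t Get4bit(const std::vector<uint8_t>& data, size_t idx) {
  return (data[idx >> 1] >> ((idx & 1) << 2)) & 0xf;
}

int main() {
  std::vector<uint8_t> data((7 + 1) / 2, 0);  // 7 rows fit in 4 bytes
  for (size_t i = 0; i < 7; ++i) Set4bit(&data, i, static_cast<uint8_t>(i % 16));
  for (size_t i = 0; i < 7; ++i) assert(Get4bit(data, i) == i % 16);
  return 0;
}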
diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp
index 8427f21c332..4b741957f79 100644
--- a/src/io/multi_val_dense_bin.hpp
+++ b/src/io/multi_val_dense_bin.hpp
@@ -50,75 +50,75 @@ class MultiValDenseBin : public MultiValBin {
     return false;
   }
 
-  #define ACC_GH(hist, i, g, h) \
-  const auto ti = static_cast<int>(i) << 1; \
-  hist[ti] += g; \
-  hist[ti + 1] += h; \
-  template <bool use_indices, bool use_prefetch, bool use_hessians>
+  template <bool USE_INDICES, bool USE_PREFETCH, bool ORDERED>
   void ConstructHistogramInner(const data_size_t* data_indices,
                                data_size_t start, data_size_t end,
                                const score_t* gradients,
                                const score_t* hessians, hist_t* out) const {
     data_size_t i = start;
-    if (use_prefetch) {
+    hist_t* grad = out;
+    hist_t* hess = out + 1;
+    if (USE_PREFETCH) {
       const data_size_t pf_offset = 32 / sizeof(VAL_T);
       const data_size_t pf_end = end - pf_offset;
       for (; i < pf_end; ++i) {
-        const auto idx = use_indices ? data_indices[i] : i;
-        const auto pf_idx = use_indices ? data_indices[i + pf_offset] : i + pf_offset;
-        PREFETCH_T0(gradients + pf_idx);
-        if (use_hessians) {
+        const auto idx = USE_INDICES ? data_indices[i] : i;
+        const auto pf_idx = USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
+        if (!ORDERED) {
+          PREFETCH_T0(gradients + pf_idx);
           PREFETCH_T0(hessians + pf_idx);
         }
         PREFETCH_T0(data_.data() + RowPtr(pf_idx));
         const auto j_start = RowPtr(idx);
         for (auto j = j_start; j < j_start + num_feature_; ++j) {
-          const VAL_T bin = data_[j];
-          if (use_hessians) {
-            ACC_GH(out, bin, gradients[idx], hessians[idx]);
+          const auto ti = static_cast<uint32_t>(data_[j]) << 1;
+          if (ORDERED) {
+            grad[ti] += gradients[i];
+            hess[ti] += hessians[i];
          } else {
-            ACC_GH(out, bin, gradients[idx], 1.0f);
+            grad[ti] += gradients[idx];
+            hess[ti] += hessians[idx];
          }
        }
      }
    }
     for (; i < end; ++i) {
-      const auto idx = use_indices ? data_indices[i] : i;
+      const auto idx = USE_INDICES ? data_indices[i] : i;
       const auto j_start = RowPtr(idx);
       for (auto j = j_start; j < j_start + num_feature_; ++j) {
-        const VAL_T bin = data_[j];
-        if (use_hessians) {
-          ACC_GH(out, bin, gradients[idx], hessians[idx]);
+        const auto ti = static_cast<uint32_t>(data_[j]) << 1;
+        if (ORDERED) {
+          grad[ti] += gradients[i];
+          hess[ti] += hessians[i];
        } else {
-          ACC_GH(out, bin, gradients[idx], 1.0f);
+          grad[ti] += gradients[idx];
+          hess[ti] += hessians[idx];
        }
      }
    }
  }
-  #undef ACC_GH
 
-  void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
-                          const score_t* gradients, const score_t* hessians,
-                          hist_t* out) const override {
-    ConstructHistogramInner(data_indices, start, end, gradients, hessians, out);
+  void ConstructHistogram(const data_size_t* data_indices, data_size_t start,
+                          data_size_t end, const score_t* gradients,
+                          const score_t* hessians, hist_t* out) const override {
+    ConstructHistogramInner<true, true, false>(data_indices, start, end,
+                                               gradients, hessians, out);
   }
 
   void ConstructHistogram(data_size_t start, data_size_t end,
-                          const score_t* gradients, const score_t* hessians,
-                          hist_t* out) const override {
-    ConstructHistogramInner(nullptr, start, end, gradients, hessians, out);
-  }
-
-  void ConstructHistogram(const data_size_t* data_indices, data_size_t start, data_size_t end,
-                          const score_t* gradients,
-                          hist_t* out) const override {
-    ConstructHistogramInner(data_indices, start, end, gradients, nullptr, out);
-  }
-
-  void ConstructHistogram(data_size_t start, data_size_t end,
-                          const score_t* gradients,
-                          hist_t* out) const override {
-    ConstructHistogramInner(nullptr, start, end, gradients, nullptr, out);
+                          const score_t* gradients, const score_t* hessians,
+                          hist_t* out) const override {
+    ConstructHistogramInner<false, false, false>(
+        nullptr, start, end, gradients, hessians, out);
+  }
+
+  void ConstructHistogramOrdered(const data_size_t* data_indices,
+                                 data_size_t start, data_size_t end,
+                                 const score_t* gradients,
+                                 const score_t* hessians,
+                                 hist_t* out) const override {
+    ConstructHistogramInner<true, true, true>(data_indices, start, end,
+                                              gradients, hessians, out);
   }
 
   MultiValBin* CreateLike(data_size_t num_data, int num_bin, int num_feature, double) const override {
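The new ConstructHistogramOrdered overload in the multi-value bins differs from ConstructHistogram only in how gradients and hessians are addressed: the ordered variant assumes they were already gathered into data_indices order (so position i is used), while the plain variant reads them by original row id. A simplified sketch of the two addressing modes (illustrative code, not the MultiValBin classes):

#include <cstddef>
#include <cstdint>
#include <vector>

template <bool ORDERED>
void MultiValRowHistogram(const std::vector<int32_t>& data_indices,
                          const std::vector<std::vector<uint32_t>>& row_bins,
                          const float* gradients, const float* hessians,
                          double* out /* 2 * num_bin entries, zeroed */) {
  for (size_t i = 0; i < data_indices.size(); ++i) {
    const int32_t row = data_indices[i];
    // ORDERED: inputs are parallel to data_indices; otherwise index by row id.
    const float g = ORDERED ? gradients[i] : gradients[row];
    const float h = ORDERED ? hessians[i] : hessians[row];
    for (uint32_t bin : row_bins[row]) {  // every feature value of this row
      const auto ti = static_cast<size_t>(bin) << 1;
      out[ti] += g;
      out[ti + 1] += h;
    }
  }
}

The ordered form lets the row-wise path reuse the gradient/hessian arrays that were already gathered for the column-wise dense groups, trading one extra copy for sequential reads in this loop.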
diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp
index c32ca3179ce..649371f2e16 100644
--- a/src/io/multi_val_sparse_bin.hpp
+++ b/src/io/multi_val_sparse_bin.hpp
@@ -106,27 +106,24 @@ class MultiValSparseBin : public MultiValBin {
   bool IsSparse() override { return true; }
 
-#define ACC_GH(hist, i, g, h) \
-  const auto ti = static_cast<int>(i) << 1; \
-  hist[ti] += g; \
-  hist[ti + 1] += h;
-
-  template <bool use_indices, bool use_prefetch, bool use_hessians>
+  template <bool USE_INDICES, bool USE_PREFETCH, bool ORDERED>
   void ConstructHistogramInner(const data_size_t* data_indices,
                                data_size_t start, data_size_t end,
                                const score_t* gradients,
                                const score_t* hessians, hist_t* out) const {
     data_size_t i = start;
-    if (use_prefetch) {
+    hist_t* grad = out;
+    hist_t* hess = out + 1;
+    if (USE_PREFETCH) {
       const data_size_t pf_offset = 32 / sizeof(VAL_T);
       const data_size_t pf_end = end - pf_offset;
       for (; i < pf_end; ++i) {
-        const auto idx = use_indices ? data_indices[i] : i;
+        const auto idx = USE_INDICES ? data_indices[i] : i;
         const auto pf_idx =
-            use_indices ? data_indices[i + pf_offset] : i + pf_offset;
-        PREFETCH_T0(gradients + pf_idx);
-        if (use_hessians) {
+            USE_INDICES ? data_indices[i + pf_offset] : i + pf_offset;
+        if (!ORDERED) {
+          PREFETCH_T0(gradients + pf_idx);
           PREFETCH_T0(hessians + pf_idx);
         }
         PREFETCH_T0(row_ptr_.data() + pf_idx);
@@ -134,57 +131,55 @@ class MultiValSparseBin : public MultiValBin {
         const auto j_start = RowPtr(idx);
         const auto j_end = RowPtr(idx + 1);
         for (auto j = j_start; j < j_end; ++j) {
-          const VAL_T bin = data_[j];
-          if (use_hessians) {
-            ACC_GH(out, bin, gradients[idx], hessians[idx]);
+          const auto ti = static_cast<uint32_t>(data_[j]) << 1;
+          if (ORDERED) {
+            grad[ti] += gradients[i];
+            hess[ti] += hessians[i];
          } else {
-            ACC_GH(out, bin, gradients[idx], 1.0f);
+            grad[ti] += gradients[idx];
+            hess[ti] += hessians[idx];
          }
        }
      }
    }
     for (; i < end; ++i) {
-      const auto idx = use_indices ? data_indices[i] : i;
+      const auto idx = USE_INDICES ? data_indices[i] : i;
       const auto j_start = RowPtr(idx);
       const auto j_end = RowPtr(idx + 1);
       for (auto j = j_start; j < j_end; ++j) {
-        const VAL_T bin = data_[j];
-        if (use_hessians) {
-          ACC_GH(out, bin, gradients[idx], hessians[idx]);
+        const auto ti = static_cast<uint32_t>(data_[j]) << 1;
+        if (ORDERED) {
+          grad[ti] += gradients[i];
+          hess[ti] += hessians[i];
        } else {
-          ACC_GH(out, bin, gradients[idx], 1.0f);
+          grad[ti] += gradients[idx];
+          hess[ti] += hessians[idx];
        }
      }
    }
  }
-#undef ACC_GH
 
   void ConstructHistogram(const data_size_t* data_indices, data_size_t start,
                           data_size_t end, const score_t* gradients,
                           const score_t* hessians,
                           hist_t* out) const override {
-    ConstructHistogramInner(data_indices, start, end,
-                            gradients, hessians, out);
+    ConstructHistogramInner<true, true, false>(data_indices, start, end,
+                                               gradients, hessians, out);
   }
 
   void ConstructHistogram(data_size_t start, data_size_t end,
                           const score_t* gradients, const score_t* hessians,
                           hist_t* out) const override {
-    ConstructHistogramInner(nullptr, start, end, gradients,
-                            hessians, out);
-  }
-
-  void ConstructHistogram(const data_size_t* data_indices, data_size_t start,
-                          data_size_t end, const score_t* gradients,
-                          hist_t* out) const override {
-    ConstructHistogramInner(data_indices, start, end,
-                            gradients, nullptr, out);
+    ConstructHistogramInner<false, false, false>(
+        nullptr, start, end, gradients, hessians, out);
   }
 
-  void ConstructHistogram(data_size_t start, data_size_t end,
-                          const score_t* gradients,
-                          hist_t* out) const override {
-    ConstructHistogramInner(nullptr, start, end, gradients,
-                            nullptr, out);
+  void ConstructHistogramOrdered(const data_size_t* data_indices,
+                                 data_size_t start, data_size_t end,
+                                 const score_t* gradients,
+                                 const score_t* hessians,
+                                 hist_t* out) const override {
+    ConstructHistogramInner<true, true, true>(data_indices, start, end,
+                                              gradients, hessians, out);
   }
 
   MultiValBin* CreateLike(data_size_t num_data, int num_bin, int,
diff --git a/src/io/sparse_bin.hpp b/src/io/sparse_bin.hpp
index 19d1ac4db4d..e7fa9a8e3c8 100644
--- a/src/io/sparse_bin.hpp
+++ b/src/io/sparse_bin.hpp
@@ -138,6 +138,8 @@ class SparseBin: public Bin {
     data_size_t i_delta, cur_pos;
     InitIndex(data_indices[start], &i_delta, &cur_pos);
     data_size_t i = start;
+    hist_t* grad = out;
+    hist_cnt_t* cnt = reinterpret_cast<hist_cnt_t*>(out + 1);
     for (;;) {
       if (cur_pos < data_indices[i]) {
         cur_pos += deltas_[++i_delta];
@@ -145,8 +147,9 @@
       } else if (cur_pos > data_indices[i]) {
         if (++i >= end) { break; }
       } else {
-        const VAL_T bin = vals_[i_delta];
-        ACC_GH(out, bin, ordered_gradients[i], 1.0f);
+        const uint32_t ti = static_cast<uint32_t>(vals_[i_delta]) << 1;
+        grad[ti] += ordered_gradients[i];
+        ++cnt[ti];
         if (++i >= end) { break; }
         cur_pos += deltas_[++i_delta];
         if (i_delta >= num_vals_) { break; }
@@ -158,12 +161,15 @@ const score_t* ordered_gradients,
                           hist_t* out) const override {
     data_size_t i_delta, cur_pos;
     InitIndex(start, &i_delta, &cur_pos);
+    hist_t* grad = out;
+    hist_cnt_t* cnt = reinterpret_cast<hist_cnt_t*>(out + 1);
     while (cur_pos < start && i_delta < num_vals_) {
       cur_pos += deltas_[++i_delta];
     }
     while (cur_pos < end && i_delta < num_vals_) {
-      const VAL_T bin = vals_[i_delta];
-      ACC_GH(out, bin, ordered_gradients[cur_pos], 1.0f);
+      const uint32_t ti = static_cast<uint32_t>(vals_[i_delta]) << 1;
+      grad[ti] += ordered_gradients[cur_pos];
+      ++cnt[ti];
       cur_pos += deltas_[++i_delta];
     }
   }
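SparseBin walks delta-encoded (deltas_, vals_) pairs and, in the constant-hessian overloads above, adds a gradient and bumps an integer count per matched row. A simplified standalone model of that traversal (separate gradient/count arrays keep the sketch short; the patch interleaves them in one buffer):

#include <cstddef>
#include <cstdint>
#include <vector>

struct SparseColumn {
  std::vector<uint8_t> deltas;  // gap to the previous stored row
  std::vector<uint8_t> vals;    // bin value at that row
};

void SparseConstHessHistogram(const SparseColumn& col, int start, int end,
                              const float* ordered_gradients,
                              double* grad_out, uint64_t* cnt_out) {
  int cur_pos = 0;
  size_t i_delta = 0;
  // Walk stored entries, skipping rows before `start`.
  while (i_delta < col.deltas.size()) {
    cur_pos += col.deltas[i_delta];
    if (cur_pos >= start) break;
    ++i_delta;
  }
  // Accumulate rows in [start, end): a gradient sum plus an integer count per
  // bin, the same gradient/count pairing the patch uses for constant hessians.
  while (i_delta < col.deltas.size() && cur_pos < end) {
    const uint32_t bin = col.vals[i_delta];
    grad_out[bin] += ordered_gradients[cur_pos];
    ++cnt_out[bin];
    ++i_delta;
    if (i_delta < col.deltas.size()) cur_pos += col.deltas[i_delta];
  }
}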